/* schoof.c
   
   Elliptic curve order calculator, for

   y^2 = x^3 + a x + b (mod p)
   
   Compile with:

   % cc -O schoof.c giants.c tools.c ellproj.c -lm -o schoof

   and run, entering p and the a,b parameters.
   Eventually the curve order o is reported, together with
   the twist order o'.  The sum of these two orders is always
   2p + 2.  A final check is performed to verify that a
   random point (x,y,z) enjoys the annihilation
   o * (x,y,z) = point-at-infinity.  This is not absolutely
   definitive, rather it is a necessary condition on the oder o
   (i.e. it is a sanity check of sorts).

 * Change history:

     18 Nov 99   REC  fixed M. Scott bug (MAX_DIGS bumped by 2)
      5 Jul 99   REC  Installed improved (base-4) power ladder
      5 Jul 99   REC  Made use of binary segmentation square (per se)
      5 Jul 99   REC  Improved memory usage
      2 May 99   REC  Added initial primality check
     30 Apr 99   REC  Added actual order-annihilation test
     29 Apr 99   REC  Improvements due to A. Powell
      2 Feb 99   REC  Added explicit CRT result
     12 Jan 99   REC  Removed (hopefully) last of memory crashes
     20 Jan 98   REC  Creation

 *	c. 1998 Perfectly Scientific, Inc.
 *	All Rights Reserved.
 *
 *
 *************************************************************/

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include "giants.h"
#include "tools.h"
#include "ellproj.h"

#define P_BREAK 32

#ifdef _WIN32
#include <string.h>
#define bzero(D, n) memset(D, 0x00, n)
#define bcopy(S, D, n) memcpy(D, S, n)
#endif
  
#define Q_MAX 256       /* Bits in largest primes handled. */
#define L_CEIL 100       /* Bound on Schoof L values (not all needed in general). */


typedef struct
         {
         int deg;    
         giant *coe;
         } polystruct;   
typedef polystruct *poly;

extern int *pr;

static int Q, L_MAX;
static int MAX_DIGS, MAX_COEFFS;

static giant *mcand, coe, tmp, err, aux, aux2, globx, globy, t1, t2, 
	t3, t4, t5;
static poly qscratch, rscratch, sscratch, tscratch, pbuff,
     pbuff2, precip, cubic, powx, powy, kxn, kyn, kxd, kyd,
     txn, txd, tyn, tyd, txn1, txd1, tyn1, tyd1,
     nx, dx, ny, dy, mn, md, tmp1, tmp2, tmp3, tmp4;
static poly s[L_CEIL+2], smonic; 
static giant p, a, b;
static int L;

void quickmodg(giant g, giant x) 
{       int sgn = x->sign;

        if(sgn == 0) return;
        if(sgn > 0) {
                if(gcompg(x, g) >= 0) subg(g, x);
                return;
        }
        addg(g,x);
        return;
}

int
log_2(int n) {
        int c = 1, d = -1;
        while(c <= n) {
                c <<= 1;
                ++d;
        }
        return(d);   
}

void
justifyp(poly x) {
        int j;
        for(j = x->deg; j >= 0; j--) {
                if(!is0(x->coe[j])) break;
        }
        x->deg = (j>0)? j : 0;
}

void
polyrem(poly x) {
        int j;
   for(j=0; j <= x->deg; j++) {
      modg(p, x->coe[j]);
   }
   justifyp(x);
}


giant *
newa(int n) {
        giant *p = (giant *)malloc(n*sizeof(giant));
        int j;
        for(j=0; j<n; j++) {
                p[j] = newgiant(MAX_DIGS);
        }
        return(p);
}

poly
newpoly(int coeffs) {
        poly pol;
        pol = (poly) malloc(sizeof(polystruct));
        pol->coe = (giant *)newa(coeffs);
        return(pol);
}

void
init_all() {
   int j;
   
   j = (2*Q + log_2(MAX_COEFFS) + 32 + 15)/16;
   j = j * MAX_COEFFS;
   globx = newgiant(j);
   globy = newgiant(j);
   s[0] = newpoly(MAX_COEFFS);

   for(j=1; j<=L_MAX+1; j++) {
                s[j] = newpoly(j*(j+1));
   }
   smonic = newpoly(MAX_COEFFS);
   powx = newpoly(MAX_COEFFS);
   powy = newpoly(MAX_COEFFS);
   kxn = newpoly(MAX_COEFFS);
   kxd = newpoly(MAX_COEFFS);
   kyn = newpoly(MAX_COEFFS);
   kyd = newpoly(MAX_COEFFS);
   txn = newpoly(MAX_COEFFS);
   txd = newpoly(MAX_COEFFS);
   tyn = newpoly(MAX_COEFFS);
   tyd = newpoly(MAX_COEFFS);
   txn1 = newpoly(MAX_COEFFS);
   txd1 = newpoly(MAX_COEFFS);
   tyn1 = newpoly(MAX_COEFFS);
   tyd1 = newpoly(MAX_COEFFS);
   nx = newpoly(MAX_COEFFS);
   ny = newpoly(MAX_COEFFS);
   dx = newpoly(MAX_COEFFS);
   dy = newpoly(MAX_COEFFS);
   mn = newpoly(MAX_COEFFS);
   md = newpoly(MAX_COEFFS);
   tmp1 = newpoly(MAX_COEFFS);
   tmp2 = newpoly(MAX_COEFFS);
   tmp3 = newpoly(MAX_COEFFS);
   tmp4 = newpoly(MAX_COEFFS);
   mcand = (giant *)newa(MAX_COEFFS);
/* The next three need extra space for remaindering routines. */
        qscratch = newpoly(2*MAX_COEFFS);
        rscratch = newpoly(2*MAX_COEFFS);
        pbuff = newpoly(2*MAX_COEFFS);
        pbuff2 = newpoly(MAX_COEFFS);
        sscratch = newpoly(MAX_COEFFS);
        tscratch = newpoly(MAX_COEFFS);
        tmp = newgiant(MAX_DIGS);
        err = newgiant(MAX_DIGS);
        coe = newgiant(MAX_DIGS);
        aux = newgiant(MAX_DIGS);
        aux2 = newgiant(MAX_DIGS);
        t3 = newgiant(MAX_DIGS);
        t4 = newgiant(MAX_DIGS);
        t5 = newgiant(MAX_DIGS);
   		cubic = newpoly(4);
        precip = newpoly(MAX_COEFFS);
}

void 
atoa(giant *a, giant *b, int n) {
        int j;
        for(j=0; j<n; j++) gtog(a[j], b[j]);
}

void
ptop(poly x, poly y)
/* y := x. */
{
        y->deg = x->deg;
        atoa(x->coe, y->coe, y->deg+1);
}

void
negp(poly y)
/* y := -y. */
{       int j;
        for(j=0; j <= y->deg; j++) {
                negg(y->coe[j]);
                quickmodg(p, y->coe[j]);
        }
}

int
iszer(giant a) {

  if(a->sign == 0) return(1);
  return(0);

}

int
iszerop(poly x) {
   if(x->deg == 0 && iszer(x->coe[0])) return 1;
   return 0;
}

int
is0(giant a) {
        return(iszer(a));
}

int
is1(giant a) {
        return(isone(a));
}


void
addp(poly x, poly y)
/* y += x. */
{
        int d = x->deg, j;

        if(y->deg > d) d = y->deg;
        for(j = 0; j <= d; j++) {
                if((j <= x->deg) && (j <= y->deg)) {
                        addg(x->coe[j], y->coe[j]);
                        quickmodg(p, y->coe[j]);
                        continue;
                }
                if((j <= x->deg) && (j > y->deg)) {
                        gtog(x->coe[j], y->coe[j]);
                        quickmodg(p, y->coe[j]);
                        continue;
                }
        }
        y->deg = d;
        justifyp(y);
}

void
subp(poly x, poly y)
/* y -= x. */
{
        int d = x->deg, j;

        if(y->deg > d) d = y->deg;
        for(j = 0; j <= d; j++) {
                if((j <= x->deg) && (j <= y->deg)) {
                        subg(x->coe[j], y->coe[j]);
                        quickmodg(p, y->coe[j]);
                        continue;
                }
                if((j <= x->deg) && (j > y->deg)) {
                        gtog(x->coe[j], y->coe[j]);
                        negg(y->coe[j]);
                        quickmodg(p, y->coe[j]);
                        continue;
                }
        }
        y->deg = d;
        justifyp(y);
}


void
grammarmulp(poly a, poly b) 
/* b *= a. */
{
        int dega = a->deg, degb = b->deg, deg = dega + degb;
        register int d, kk, first, diffa;

        for(d=deg; d>=0; d--) {
                diffa = d-dega;
                itog(0, coe);
                for(kk=0; kk<=d; kk++) {
                        if((kk>degb)||(kk<diffa)) continue;
                        gtog(b->coe[kk], tmp);
                        mulg(a->coe[d-kk], tmp);
                        modg(p, tmp);
                        addg(tmp, coe);
                        quickmodg(p, coe);
                }
                gtog(coe, mcand[d]);
        }
        atoa(mcand, b->coe, deg+1);
        b->deg = deg;
        justifyp(b);
}

void
atop(giant *a, poly x, int deg)
/* Copy array to poly, with monic option. */ 
{
        int adeg = abs(deg);
        x->deg = adeg;
        atoa(a, x->coe, adeg);
        if(deg < 0) {
           itog(1, x->coe[adeg]);
        } else {
           gtog(a[adeg], x->coe[adeg]);
        }
}

void
just(giant g) {
   while((g->n[g->sign-1] == 0) && (g->sign > 0)) --g->sign;
}

void
unstuff_partial(giant g, poly y, int words){
        int j;
        for(j=0; j < y->deg; j++) {
                bcopy((g->n) + j*words, y->coe[j]->n, words*sizeof(short));
      y->coe[j]->sign = words;
      just(y->coe[j]);
   }
}

void
stuff(poly x, giant g, int words) {
        int deg = x->deg, j, coedigs;

   g->sign = words*(1 + deg);
   for(j=0; j <= deg; j++) {
                coedigs = (x->coe[j])->sign;
                bcopy(x->coe[j]->n, (g->n) + j*words, coedigs*sizeof(short));
      bzero((g->n) + (j*words+coedigs), 
                                sizeof(short)*(words-coedigs));
        }
   just(g);
}

int maxwords = 0;
void

binarysegmul(poly x, poly y) {
        int bits = bitlen(p), xwords, ywords, words;
   
        xwords = (2*bits + log_2(x->deg+1) + 32 + 15)/16;
   ywords = (2*bits + log_2(y->deg+1) + 32 + 15)/16;
   if(xwords > ywords) words = xwords; else words = ywords;
   stuff(x, globx, words);
   stuff(y, globy, words);
   mulg(globx, globy);
   gtog(y->coe[y->deg], globx);  /* Save high coeff. */
   y->deg += x->deg;
   gtog(globx, y->coe[y->deg]);  /* Move high coeff. */
   unstuff_partial(globy, y, words);
   mulg(x->coe[x->deg], y->coe[y->deg]); /* resolve high coeff. */
   justifyp(y);
}

binarysegsquare(poly y) {
        int bits = bitlen(p), words;
      words = (2*bits + log_2(y->deg+1) + 32 + 15)/16;
   stuff(y, globy, words);
   squareg(globy);
   gtog(y->coe[y->deg], globx);  /* Save high coeff. */
   y->deg += y->deg;
   gtog(globx, y->coe[y->deg]);  /* Move high coeff. */
   unstuff_partial(globy, y, words);
   mulg(y->coe[y->deg], y->coe[y->deg]); /* resolve high coeff. */
   justifyp(y);
}

void
assess(poly x, poly y){
        int max = 0, j;
   for(j=0; j <= x->deg; j++) {
                if(bitlen(x->coe[j]) > max) max = bitlen(x->coe[j]);
   }
   max = 0;
   for(j=0; j <= y->deg; j++) {
                if(bitlen(y->coe[j]) > max) max = bitlen(y->coe[j]);
   }
}

int
pcompp(poly x, poly y) {
    int j;
    if(x->deg != y->deg) return 1;
    for(j=0; j <= x->deg; j++) {
                        if(gcompg(x->coe[j], y->coe[j])) return 1;
    }
    return 0;
}

/*
int max_deg = 0;
*/

void
mulp(poly x, poly y)
/*  y *= x. */
{
        int n, degx = x->deg, degy = y->deg;

/*
if(degx > max_deg) {
	max_deg = degx; printf("xdeg: %d\n", degx);
}

if(degy > max_deg) {
	max_deg = degy; printf("ydeg: %d\n", degy);
}
*/
        if((degx < P_BREAK) || (degy < P_BREAK)) {
                grammarmulp(x,y);
                justifyp(y);
                return;
        }
   if(x==y) binarysegsquare(y);
   	else binarysegmul(x, y);
}

void
revp(poly x) 
/* Reverse the coefficients of x. */
{       int j, deg = x->deg;

        for(j=0; j <= deg/2; j++) {
                gtog(x->coe[deg-j], tmp);
                gtog(x->coe[j], x->coe[deg-j]);
                gtog(tmp, x->coe[j]);
        }
   justifyp(x);
}

void
recipp(poly f, int deg) 
/* f := 1/f, via newton method.  */
{
        int lim = deg + 1, prec = 1;

        sscratch->deg = 0; itog(1, sscratch->coe[0]);
        itog(1, aux);
        while(prec < lim) {
                prec <<= 1;
                if(prec > lim) prec = lim;
                f->deg = prec-1;
                ptop(sscratch, tscratch);
                mulp(f, tscratch); 
				tscratch->deg = prec-1;
				polyrem(tscratch);
                subg(aux, tscratch->coe[0]);
                quickmodg(p, tscratch->coe[0]);
                mulp(sscratch, tscratch); 
				tscratch->deg = prec-1;
				polyrem(tscratch);
                subp(tscratch, sscratch); 
                sscratch->deg = prec-1;
				polyrem(sscratch);
        }
        justifyp(sscratch);
        ptop(sscratch, f);
}

int
left_justifyp(poly x, int start)
/* Left-justify the polynomial, checking from position "start." */
{
        int j, shift = 0;

        for(j = start; j <= x->deg; j++) {
                if(!is0(x->coe[j])) {
                     shift = start;
                     break;
                }
        }
        x->deg -= shift;
        for(j=0; j<= x->deg; j++) {
                gtog(x->coe[j+shift], x->coe[j]);
        }
        return(shift);
}

void
remp(poly x, poly y, int mode)
/* y := x (mod y). 
  mode = 0 is normal operation,
       = 1 jams a fixed reciprocal,
       = 2 uses the fixed reciprocal.
 */
{
        int degx = x->deg, degy = y->deg, d, shift;

        if(degy == 0) {
                y->deg = 0;
                itog(0, y->coe[0]);
                return;
        }
        d = degx - degy;
        if(d < 0) {
                ptop(x, y);
                return;
        }
        revp(x); revp(y);
        ptop(y, rscratch);
        switch(mode) {
           case 0: recipp(rscratch, d);
                   break;
      case 1: recipp(rscratch, degy); /* degy -1. */
                   ptop(rscratch, precip);
                   rscratch->deg = d; justifyp(rscratch);
                   break;
           case 2: ptop(precip, rscratch);
                   rscratch->deg = d; justifyp(rscratch);
                   break;
        } 
/* Next, a limited-precision multiply. */
        if(d < degx) { x->deg = d; justifyp(x);}
        mulp(x, rscratch); 
        rscratch->deg = d; 
		polyrem(rscratch);
		justifyp(rscratch);
        x->deg = degx; justifyp(x);
        mulp(rscratch, y);
        subp(x, y);
        negp(y); polyrem(y);
        shift = left_justifyp(y, d+1);
   for(d = y->deg+1; d <= degx-shift; d++) itog(0, y->coe[d]);
        y->deg = degx - shift;
        revp(y);
        revp(x);
}

fullmod(poly x) {
   polyrem(x);
   ptop(smonic, s[0]);
   remp(x, s[0], 2);
   ptop(s[0], x);
   polyrem(x);
}

scalarmul(giant s, poly x) {
        int j;
   for(j=0; j <= x->deg; j++) {
                mulg(s, x->coe[j]);
      modg(p, x->coe[j]);
   }
}


schain(int el) {
   int j;

        s[0]->deg = 0;
   itog(0, s[0]->coe[0]);

        s[1]->deg = 0;
   itog(1, s[1]->coe[0]);
        s[2]->deg = 0;
   itog(2, s[2]->coe[0]);

   s[3]->deg = 4;
   gtog(a, aux); mulg(a, aux); negg(aux);   
   gtog(aux, s[3]->coe[0]);
   gtog(b, aux); smulg(12, aux);
   gtog(aux, s[3]->coe[1]);
   gtog(a, aux); smulg(6, aux);
   gtog(aux, s[3]->coe[2]);
   itog(0, s[3]->coe[3]);
   itog(3, s[3]->coe[4]);

   s[4]->deg = 6;
   gtog(a, aux); mulg(a, aux); mulg(a, aux);
   gtog(b, tmp); mulg(b, tmp); smulg(8, tmp); addg(tmp, aux);
   negg(aux);   
   gtog(aux, s[4]->coe[0]);
   gtog(b, aux); mulg(a, aux); smulg(4, aux); negg(aux);
   gtog(aux, s[4]->coe[1]);
   gtog(a, aux); mulg(a, aux); smulg(5, aux); negg(aux);
   gtog(aux, s[4]->coe[2]);
   gtog(b, aux); smulg(20, aux);
   gtog(aux, s[4]->coe[3]);
   gtog(a, aux); smulg(5, aux);
   gtog(aux, s[4]->coe[4]);
   itog(0, s[4]->coe[5]);
   itog(1, s[4]->coe[6]);
   itog(4, aux);
   scalarmul(aux, s[4]);
   cubic->deg = 3;
   itog(1, cubic->coe[3]);
   itog(0, cubic->coe[2]);
   gtog(a, cubic->coe[1]);
   gtog(b, cubic->coe[0]);
   for(j=5; j <= el; j++) {
        if(j % 2 == 0) {
                                ptop(s[j/2 + 2], s[j]); mulp(s[j/2-1], s[j]);
            polyrem(s[j]); mulp(s[j/2-1], s[j]); polyrem(s[j]);
            ptop(s[j/2-2], s[0]); mulp(s[j/2+1], s[0]); polyrem(s[0]);
            mulp(s[j/2+1], s[0]); polyrem(s[0]);
            subp(s[0], s[j]); mulp(s[j/2], s[j]); polyrem(s[j]);
                           gtog(p, aux); iaddg(1, aux); gshiftright(1, aux);
                                scalarmul(aux, s[j]);
        } else {
            ptop(s[(j-1)/2+2], s[j]);
            mulp(s[(j-1)/2], s[j]); 
polyrem(s[j]);
            mulp(s[(j-1)/2], s[j]); 
polyrem(s[j]);
            mulp(s[(j-1)/2], s[j]); 
polyrem(s[j]);
            ptop(s[(j-1)/2-1], s[0]);
            mulp(s[(j-1)/2 + 1], s[0]); polyrem(s[0]);
            mulp(s[(j-1)/2 + 1], s[0]); polyrem(s[0]);
            mulp(s[(j-1)/2 + 1], s[0]); polyrem(s[0]);
            if(((j-1)/2) % 2 == 1) {
                                        mulp(cubic, s[0]); polyrem(s[0]);
                                        mulp(cubic, s[0]); polyrem(s[0]);
            } else {
                                        mulp(cubic, s[j]); polyrem(s[j]);
                                        mulp(cubic, s[j]); polyrem(s[j]);
            }
// polyout(s[1]); polyout(s[3]); polyout(s[0]); polyout(s[j]);
            subp(s[0], s[j]);
            polyrem(s[j]);
        }
   }
}      

init_recip(int el) {
   int j;
   ptop(s[el], smonic); 
   if(el == 2) {
		mulp(cubic, smonic); polyrem(smonic);
   }
   gtog(smonic->coe[smonic->deg], aux); /* High coeff. */
   binvg(p, aux);
   scalarmul(aux, smonic);  /* s is now monic. */
   s[0]->deg = smonic->deg + 1;
   for(j=0; j <= s[0]->deg; j++) itog(1, s[0]->coe[j]);
   ptop(smonic, pbuff);
   remp(s[0], pbuff, 1);  /* Initialize reciprocal of s as precip. */
}

/* void powerpoly(poly x, giant n)
{       int len, pos;
        ptop(x, pbuff);
        x->deg = 0; itog(1, x->coe[0]);
        len = bitlen(n);
        pos = 0;
        while(1) {
                if(bitval(n, pos++)) {
                        mulp(pbuff, x);
                        fullmod(x);
                }
                if(pos>=len) break;
                mulp(pbuff, pbuff);
                fullmod(pbuff);
        }
}
*/

void powerpoly(poly x, giant n)
/* Base-4 window. */
{       int pos, code;
        ptop(x, pbuff);  /* x. */
	ptop(pbuff, pbuff2);
	mulmod(pbuff2, pbuff2); mulmod(pbuff, pbuff2); /* x^3. */ 
        pos = bitlen(n)-2;
        while(pos >= 0) {
		mulmod(x, x);
		if(pos==0) {
			if(bitval(n, pos) != 0) {
				mulmod(pbuff, x);
			}
			break;
		}
		code = (bitval(n, pos) != 0) * 2 + (bitval(n, pos-1) != 0);
		switch(code) {
			case 0: mulmod(x,x); break;
			case 1: mulmod(x,x); 
				mulmod(pbuff, x);
				break;
			case 2: mulmod(pbuff, x); 
				mulmod(x,x); break;
			case 3: mulmod(x,x); mulmod(pbuff2, x); break;
		}
		pos -= 2;
        }
}

mulmod(poly x, poly y) {
        mulp(x, y); fullmod(y);
}

elldoublep(poly n1, poly d1, poly m1, poly c1, poly n0, poly d0,
                poly m0, poly c0) {

     ptop(n1, mn); mulmod(n1, mn);
          ptop(mn, pbuff); addp(mn, mn); addp(pbuff, mn); 
     fullmod(mn);
          ptop(d1, pbuff); mulmod(d1, pbuff);
          scalarmul(a, pbuff); addp(pbuff, mn);
          fullmod(mn);
          mulmod(c1, mn);
          ptop(m1, md); addp(md, md);
          mulmod(d1, md); mulmod(d1, md); mulmod(cubic, md);

     ptop(d1, n0); mulmod(mn, n0); mulmod(mn, n0);
          mulmod(cubic, n0);
          ptop(n1, pbuff); addp(pbuff, pbuff); fullmod(pbuff);
          mulmod(md, pbuff); mulmod(md, pbuff);
          subp(pbuff, n0); fullmod(n0);
     ptop(md, d0); mulmod(md, d0); mulmod(d1, d0);

     ptop(mn, m0); mulmod(c1, m0);
          ptop(d0, pbuff); mulmod(n1, pbuff);
          ptop(n0, c0); mulmod(d1, c0); subp(c0, pbuff);
          fullmod(pbuff);
          mulmod(pbuff, m0);
          ptop(m1, pbuff); mulmod(md, pbuff);
          mulmod(d1, pbuff); mulmod(d0, pbuff);
          subp(pbuff, m0); fullmod(m0);

     ptop(c1, c0); mulmod(md, c0); mulmod(d1, c0); mulmod(d0, c0);
}

elladdp(poly n1, poly d1, poly m1, poly c1, poly n2, poly d2, poly m2, poly c2, poly n0, poly d0, poly m0, poly c0) {
        ptop(m2, mn); mulmod(c1, mn);
   ptop(m1, pbuff); mulmod(c2, pbuff);
   subp(pbuff, mn); fullmod(mn);
   mulmod(d1, mn); mulmod(d2, mn);

        ptop(n2, md); mulmod(d1, md);
   ptop(n1, pbuff); mulmod(d2, pbuff);
   subp(pbuff, md); fullmod(md);
   mulmod(c1, md); mulmod(c2, md);

   ptop(cubic, n0); mulmod(mn, n0); mulmod(mn, n0); 
   mulmod(d1, n0); mulmod(d2, n0);
   ptop(n1, pbuff); mulmod(d2, pbuff);
   ptop(n2, d0); mulmod(d1, d0);
   addp(d0, pbuff); mulmod(md, pbuff); mulmod(md, pbuff);
   subp(pbuff, n0); fullmod(n0);

   ptop(md, d0); mulmod(md, d0); mulmod(d1, d0); mulmod(d2, d0);

   ptop(mn, m0); mulmod(c1, m0);
   ptop(d0, pbuff); mulmod(n1, pbuff);
   ptop(d1, c0); mulmod(n0, c0);
   subp(c0, pbuff); fullmod(pbuff);
   mulmod(pbuff, m0);
   ptop(m1, pbuff); mulmod(md, pbuff);
   mulmod(d0, pbuff); mulmod(d1, pbuff);
   subp(pbuff, m0); fullmod(m0);

   ptop(md, c0); mulmod(c1, c0); mulmod(d0, c0); mulmod(d1, c0);

}   

polyout(poly x) {
   int j;
   for(j=0; j <= x->deg; j++) {printf("%d: ",j); gout(x->coe[j]);}
}

main(int argc, char **argv) {
      int j, ct = 0, el, xmatch, ymatch;
      int k, t;
      int T[L_CEIL], P[L_CEIL], LL[L_CEIL];
      giant ss[L_CEIL];
      unsigned int ord, ordminus;
      point_proj pt, pt2;

      p = newgiant(INFINITY);  /* Also sets up internal giants' stacks. */
      j = ((Q_MAX+15)/16);
      init_tools(2*j);
	  a = newgiant(j);
	  b = newgiant(j);
      
entry:
      printf("Give p > 3, a, b on separate lines:\n"); fflush(stdout);
      gin(p);  /* Field prime. */
      if((Q = bitlen(p)) > Q_MAX) {
		fprintf(stderr, "p too large, larger than %d bits.\n", Q);
		goto entry;
	  }
      if(!prime_probable(p)) {
		fprintf(stderr, "p is not but must be prime.\n", Q);
		goto entry;
	  }
      gin(a); gin(b); /* Curve parameters. */

	  t1 = newgiant(2*j);
	  t2 = newgiant(2*j);
/* Next, discriminant test for legitimacy of curve. */
	gtog(a, t1); squareg(t1); modg(p, t1); mulg(a, t1); modg(p, t1);
    gshiftleft(2, t1);  /* 4 a^3 mod p. */
    gtog(b, t2); squareg(t2); modg(p, t2); smulg(27, t2);
    addg(t2, t1); modg(p, t1);
    if(isZero(t1)) {
		fprintf(stderr, "Discriminant FAILED\n");
		goto entry;
    }
    printf("Discriminant PASSED\n"); fflush(stdout);

/* Next, find an efficient prime power array such that
   Prod[powers] >= 16 p. */

 /* Create minimal prime power array such that Prod[powers]^2 > 16p. */

    gtog(p, t2); gshiftleft(4, t2);   /* t2 := 16p. */

    L_MAX = 3;
    while(L_MAX <= L_CEIL-1) {
		for(j=0; j <= L_MAX; j++) LL[j] = 0;
   		for(j=2; j <= L_MAX; j++) {
			if(primeq(j)) { 
				LL[j] = 1;
		    	k = j;
				while(1) {
			    	k *= j;
					if(k <= L_MAX) {
						LL[k] = 1;
						LL[k/j] = 0;
					}
					else break;
				}
			}
		}
    	itog(1, t1);
    	for(j=2; j <= L_MAX; j++) {
			if(LL[j]) { smulg(j, t1); smulg(j, t1); } /* Building up t1^2. */
		}
		if(gcompg(t1, t2) > 0) break;
        ++L_MAX;
	}

   printf("Initializing for prime powers:\n"); 
   for(j=2; j <= L_MAX; j++) {
		if(LL[j]) printf("%d ", j);
   }
   printf("\n");
   fflush(stdout);


   MAX_DIGS = (2 + (Q+15)/8);  /* Size of (squared) coefficients. */   
   MAX_COEFFS = ((L_MAX+1)*(L_MAX+2));

   init_all();
   schain(L_MAX+1);

for(L = 2; L <= L_MAX; L++) {
      if(!LL[L]) continue;
printf("Resolving Schoof L = %d...\n", L);
      P[ct] = L;  /* Stuff another prime power. */
      init_recip(L);
// printf("s: "); polyout(s[L]);
      gtog(p, aux2);
      k = idivg(L, aux2);  /* p (mod L). */

printf("power...\n");
      txd->deg = 0; itog(1, txd->coe[0]);
      tyd->deg = 0; itog(1, tyd->coe[0]);
      txn->deg = 1; itog(0, txn->coe[0]); itog(1, txn->coe[1]);
      ptop(txn, kxn);
      
      gtog(p, aux2);
      powerpoly(txn, aux2); /* x^p. */
printf("x^p done...\n");
      ptop(txn, powx);
      powerpoly(powx, aux2);
printf("x^p^2 done...\n");
      ptop(cubic, tyn);
      gtog(p, aux2); itog(1, aux); subg(aux, aux2);
      gshiftright(1, aux2); /* aux2 := (p-1)/2. */
      powerpoly(tyn, aux2); /* y^p. */
printf("y^p done...\n");
      ptop(tyn, powy); mulp(tyn, powy); fullmod(powy);
      mulp(cubic, powy); fullmod(powy);
      powerpoly(powy, aux2);
      mulp(tyn, powy); fullmod(powy);
printf("Powers done...\n");

// printf("pow" ); polyout(powx); polyout(powy);
      ptop(txn, txn1); ptop(txd, txd1);  /* Save t = 1 case. */
      ptop(tyn, tyn1); ptop(tyd, tyd1);
/* We now shall test
     (powx, y powy) + k(kxn/kxd, y kyn/kyd) = t(txn/txd, y tyn/tyd)
 */

    if(k==1) { ptop(txd, kxd); ptop(txd, kyd);
                              ptop(txd, kyn);
    } else {
                   ptop(s[k], kxd); mulp(s[k], kxd); fullmod(kxd);
     if(k%2==0) { mulp(cubic, kxd); fullmod(kxd); }
     mulp(kxd, kxn); fullmod(kxn);
     ptop(s[k-1], pbuff); mulp(s[k+1], pbuff); fullmod(pbuff);
     if(k%2==1) {mulp(cubic, pbuff); fullmod(pbuff); }
     subp(pbuff, kxn); fullmod(kxn);

     ptop(s[k+2], kyn); mulp(s[k-1], kyn); fullmod(kyn);
          mulp(s[k-1], kyn); fullmod(kyn);
     if(k > 2) {
                ptop(s[k-2], pbuff); mulp(s[k+1], pbuff); fullmod(pbuff);
          mulp(s[k+1], pbuff); fullmod(pbuff);
     subp(pbuff, kyn); fullmod(kyn);
     }
     ptop(s[k], kyd); mulp(s[k], kyd); fullmod(kyd);     
          mulp(s[k], kyd); fullmod(kyd);
     if(k%2==0) { mulp(cubic, kyd); fullmod(kyd);
                        mulp(cubic, kyd); fullmod(kyd);}
     itog(4, aux2); scalarmul(aux2, kyd);
    }
//printf("kP: "); polyout(kxn); polyout(kxd); polyout(kyn); polyout(kyd);
/* Commence t = 0 check. */
printf("Checking t = %d ...\n", 0);
fflush(stdout);

     ptop(powx, pbuff); mulp(kxd, pbuff);
     subp(kxn, pbuff); 
     fullmod(pbuff);

     xmatch = ymatch = 0;
     if(iszerop(pbuff)) {
		 xmatch = 1;
         /* Now check y coords. */
 		 if(L == 2) goto resolve;
         ptop(powy, pbuff); mulp(kyd, pbuff);
         addp(kyn, pbuff);
         fullmod(pbuff);
         if(iszerop(pbuff)) {
               resolve:
					printf("%d %d\n", L, 0);
               		T[ct++] = 0;
                    continue;
         } else ymatch = 1;
     }
/* Combine pt1 and pt2. */
   if((xmatch == 1) && (ymatch == 1)) 
       elldoublep(powx, txd, powy, txd, nx, dx, ny, dy);
       else
       elladdp(powx, txd, powy, txd, kxn, kxd, kyn, kyd, nx, dx, ny, dy);
/* Now {nx/dx, ny/dy} is (fixed) LHS. */
// printf("add12: "); polyout(nx); polyout(dx); polyout(ny); polyout(dy);
/* Commence t > 0 check. */
    for(t=1; t <= L/2; t++) {
printf("Checking t = %d ...\n", t);
         if(t > 1) { /* Add (tx1, ty1) to (txn, tyn). */
                                ptop(txn1, pbuff); mulmod(txd, pbuff);
            ptop(txn, powx); mulmod(txd1, powx);
                                subp(powx, pbuff); fullmod(pbuff);
            if(!iszerop(pbuff))
                                 elladdp(txn1, txd1, tyn1, tyd1, txn, txd, tyn, tyd,
                                        tmp1, tmp2, tmp3, tmp4);
                           else elldoublep(txn, txd, tyn, tyd,
                                        tmp1, tmp2, tmp3, tmp4); 
            ptop(tmp1, txn); ptop(tmp2, txd);
                           ptop(tmp3, tyn); ptop(tmp4, tyd);
         }
// printf("tQ: "); polyout(txn); polyout(txd); polyout(tyn); polyout(tyd);
         /* Next, check {nx/dx, ny/dy} =? {txn/txd, tyn/tyd}. */
                        ptop(nx, pbuff); mulmod(txd, pbuff);
                   ptop(dx, powx); mulmod(txn, powx);
                        subp(powx, pbuff); fullmod(pbuff);
                   if(!iszerop(pbuff)) continue;
         /* Next, check y. */
                //      printf("y check!\n");
                        ptop(ny, pbuff); mulmod(tyd, pbuff);
                   ptop(dy, powx); mulmod(tyn, powx);
                        subp(powx, pbuff); fullmod(pbuff);
                   if(iszerop(pbuff)) {
                        printf("%d %d\n", L, t);
               			T[ct++] = t;
                   }  else {
                        printf("%d %d\n", L, L-t);
               			T[ct++] = L-t;
         			}
					fflush(stdout);
         			break;
   }
}

/* Now, prime powers P[] and CRT residues T[] are intact. */
	printf("Prime powers L:\n");
	printf("{");
    for(j=0; j < ct-1; j++) {
		printf("%d, ", P[j]);
    }
    printf("%d }\n", P[ct-1]);

	printf("Residues t (mod L):\n");
	printf("{");
    for(j=0; j < ct-1; j++) {
		printf("%d, ", T[j]);
    }
    printf("%d }\n", T[ct-1]);

/* Mathematica algorithm for order: 
plis = {2^5, 3^3, 5^2, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47};
tlis = {1,    26,   4, 2,  4, 11,  6,  5, 19, 22, 10, 16,  7, 22, 11};
prod = Apply[Times, plis];
prlis = prod/plis;
invlis = Table[PowerMod[prlis[[q]], -1, plis[[q]]],{q,1,Length[plis]}];
p = 2^127 - 1;
t = Mod[tlis . (prlis * invlis), prod];
ord = p + 1 - If[t^2 > 4p, t - prod, t]
*/

  itog(1, t1);
  for(j=0; j < ct; j++) {
    free(s[j]);  /* Just to clear memory. */
	smulg(P[j], t1);
  }

  for(j=0; j < 2*ct; j++) {
	   ss[j] = newgiant(MAX_DIGS);
  }

  for(j=0; j < ct; j++) {
	gtog(t1, ss[j]);
    itog(P[j], t2);
    divg(t2, ss[j]);
  }

  for(j=0; j < ct; j++) {
       gtog(ss[j], ss[j+ct]);
       itog(P[j], t2);
	   invg(t2, ss[j+ct]);
  }

  itog(0, t4);
  for(j=0; j < ct; j++) {
	itog(T[j], t5);
	mulg(ss[j], t5);
	mulg(ss[j+ct], t5);
	addg(t5, t4);
  }
  modg(t1, t4);
  gtog(p, t5);
  iaddg(1, t5);
  gtog(t4, t2);
  squareg(t4);
  gtog(p, t3); gshiftleft(2, t3);
  if(gcompg(t4, t3) > 0) subg(t1, t2);
  subg(t2, t5);
  printf("Parameters:\n");
  printf("p = "); gout(p);
  printf("a = "); gout(a);
  printf("b = "); gout(b);
  printf("Curve order:\n");
  printf("o = "); gout(t5); gtog(t5, t3); /* Save order as t3. */
  printf("Twist order:\n");
  printf("o' = ");
  addg(t2, t5);
  addg(t2, t5);
  gout(t5);
/* Next, verify the order. */
  printf("Verifying order o:...\n");
  init_ell_proj(MAX_DIGS);
  pt = new_point_proj(MAX_DIGS);
  pt2 = new_point_proj(MAX_DIGS);
  itog(1,t2);
  find_point_proj(pt, t2, a, b, p);
  printf("A point on the curve y^2 = x^3 + a x + b (mod p) is:\n");
  printf("(x,y,z) = {\n"); gout(pt->x); printf(",");
  gout(pt->y); printf(","); gout(pt->z);  
  printf("}\n");
  ell_mul_proj(pt, pt2, t3, a, p);
  printf("A multiple is:\n");
  printf("o * (x,y,z) = {\n");
  gout(pt2->x); printf(",");gout(pt2->y); printf(",");gout(pt2->z);  
  printf("}\n");
  if(!isZero(pt2->z)) {
	printf("TILT: multiple should be point-at-infinity.\n");
	exit(1);
  }
  printf("VERIFIED: multiple is indeed point-at-infinity.\n");
}
